# clear your environment
rm(list = ls(all=TRUE))
input_data = read.csv('trainDF_for_Clustering_21-15-35.csv', header = T)
str(input_data)
## 'data.frame': 11071 obs. of 20 variables:
## $ CustomerID : Factor w/ 11071 levels "C12116","C12117",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ProfessionalLicensure : int 0 1 0 0 1 0 0 1 0 1 ...
## $ eCommerceAccount : int 1 1 1 1 1 1 1 1 0 1 ...
## $ SocialMediaAccount : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 1 2 2 1 ...
## $ NoOfProperties : int 2 1 1 1 1 1 4 1 1 2 ...
## $ CreditRiskScore : int 711 767 827 795 827 738 670 674 560 744 ...
## $ age_in_years : int 50 38 49 43 44 40 37 36 37 40 ...
## $ Total_no_of_loan_tenure_with_current: int 24 10 35 35 35 27 27 27 11 27 ...
## $ Asset_type : Factor w/ 2 levels "Complex With shared services",..: 2 2 2 1 2 2 2 2 2 2 ...
## $ Villa_House : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 2 ...
## $ Investment_SelfOccupied : Factor w/ 2 levels "Investment","Self Occupancy": 2 2 2 2 2 2 1 2 2 2 ...
## $ Payment_Status : Factor w/ 3 levels "Default","Non-Payoff/Non-Default",..: 1 1 2 2 2 2 2 2 1 2 ...
## $ Remaining_loan_tenure : int 65 104 45 46 47 80 80 80 98 81 ...
## $ ratio_RE_inflation : num 2.03 1.1 2.27 2.25 2.26 ...
## $ ratio_Loan_to_Appraisedvalu_Percent : num 0.379 0.894 0.397 0.396 0.371 ...
## $ ratio_Interestrate : num 1 1.292 1.001 1 0.922 ...
## $ ratio_outstanding : num 0.765 0.979 0.887 0.88 0.826 ...
## $ diff_Salary_Utility : num 745 1155 1137 837 668 ...
## $ ratio_curr_outstanding_Salary : num 23.5 98 23.2 22 40.1 ...
## $ Avg_transaction_reqd : num 30 16.7 44.7 46.1 47.5 ...
summary(input_data)
## CustomerID ProfessionalLicensure eCommerceAccount SocialMediaAccount
## C12116 : 1 Min. :0.0000 Min. :0.000 No :3465
## C12117 : 1 1st Qu.:0.0000 1st Qu.:0.000 Yes:7606
## C12118 : 1 Median :1.0000 Median :1.000
## C12119 : 1 Mean :0.5722 Mean :0.523
## C12120 : 1 3rd Qu.:1.0000 3rd Qu.:1.000
## C12121 : 1 Max. :1.0000 Max. :1.000
## (Other):11065
## NoOfProperties CreditRiskScore age_in_years
## Min. :1.000 Min. :350.0 Min. :23.00
## 1st Qu.:1.000 1st Qu.:497.0 1st Qu.:37.00
## Median :1.000 Median :641.0 Median :38.00
## Mean :1.429 Mean :627.2 Mean :39.04
## 3rd Qu.:2.000 3rd Qu.:777.0 3rd Qu.:40.00
## Max. :4.000 Max. :830.0 Max. :70.00
##
## Total_no_of_loan_tenure_with_current Asset_type
## Min. :10.00 Complex With shared services: 757
## 1st Qu.:14.00 No shred services :10314
## Median :23.00
## Mean :24.34
## 3rd Qu.:33.00
## Max. :51.00
##
## Villa_House Investment_SelfOccupied Payment_Status
## No :4199 Investment :1670 Default :3327
## Yes:6872 Self Occupancy:9401 Non-Payoff/Non-Default:3543
## Payoff :4201
##
##
##
##
## Remaining_loan_tenure ratio_RE_inflation
## Min. : 1.00 Min. :0.6965
## 1st Qu.: 82.00 1st Qu.:0.7783
## Median : 92.00 Median :0.8610
## Mean : 88.84 Mean :0.9714
## 3rd Qu.:104.00 3rd Qu.:1.0925
## Max. :150.00 Max. :2.8494
##
## ratio_Loan_to_Appraisedvalu_Percent ratio_Interestrate ratio_outstanding
## Min. :0.05706 Min. :0.2247 Min. :0.1510
## 1st Qu.:0.86134 1st Qu.:1.0000 1st Qu.:0.9341
## Median :1.12380 Median :1.0000 Median :0.9691
## Mean :1.05316 Mean :1.2105 Mean :0.9465
## 3rd Qu.:1.26403 3rd Qu.:1.0508 3rd Qu.:0.9872
## Max. :1.54046 Max. :8.0250 Max. :1.1452
##
## diff_Salary_Utility ratio_curr_outstanding_Salary Avg_transaction_reqd
## Min. : 515.2 Min. : 0.00 Min. :-496.50
## 1st Qu.: 1051.9 1st Qu.: 45.40 1st Qu.: 15.00
## Median : 1505.1 Median : 72.31 Median : 25.00
## Mean : 3729.7 Mean : 89.51 Mean : 26.29
## 3rd Qu.: 3686.5 3rd Qu.: 103.57 3rd Qu.: 34.47
## Max. :99612.8 Max. :2084.67 Max. :4110.60
##
input_data$ProfessionalLicensure = as.factor(as.character(input_data$ProfessionalLicensure))
input_data$eCommerceAccount = as.factor(as.character(input_data$eCommerceAccount))
input_data$NoOfProperties = as.factor(as.character(input_data$NoOfProperties))
#Now see the structure of the dataframe
str(input_data)
## 'data.frame': 11071 obs. of 20 variables:
## $ CustomerID : Factor w/ 11071 levels "C12116","C12117",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ProfessionalLicensure : Factor w/ 2 levels "0","1": 1 2 1 1 2 1 1 2 1 2 ...
## $ eCommerceAccount : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 1 2 ...
## $ SocialMediaAccount : Factor w/ 2 levels "No","Yes": 1 2 2 1 1 1 1 2 2 1 ...
## $ NoOfProperties : Factor w/ 4 levels "1","2","3","4": 2 1 1 1 1 1 4 1 1 2 ...
## $ CreditRiskScore : int 711 767 827 795 827 738 670 674 560 744 ...
## $ age_in_years : int 50 38 49 43 44 40 37 36 37 40 ...
## $ Total_no_of_loan_tenure_with_current: int 24 10 35 35 35 27 27 27 11 27 ...
## $ Asset_type : Factor w/ 2 levels "Complex With shared services",..: 2 2 2 1 2 2 2 2 2 2 ...
## $ Villa_House : Factor w/ 2 levels "No","Yes": 2 2 2 1 2 1 2 2 2 2 ...
## $ Investment_SelfOccupied : Factor w/ 2 levels "Investment","Self Occupancy": 2 2 2 2 2 2 1 2 2 2 ...
## $ Payment_Status : Factor w/ 3 levels "Default","Non-Payoff/Non-Default",..: 1 1 2 2 2 2 2 2 1 2 ...
## $ Remaining_loan_tenure : int 65 104 45 46 47 80 80 80 98 81 ...
## $ ratio_RE_inflation : num 2.03 1.1 2.27 2.25 2.26 ...
## $ ratio_Loan_to_Appraisedvalu_Percent : num 0.379 0.894 0.397 0.396 0.371 ...
## $ ratio_Interestrate : num 1 1.292 1.001 1 0.922 ...
## $ ratio_outstanding : num 0.765 0.979 0.887 0.88 0.826 ...
## $ diff_Salary_Utility : num 745 1155 1137 837 668 ...
## $ ratio_curr_outstanding_Salary : num 23.5 98 23.2 22 40.1 ...
## $ Avg_transaction_reqd : num 30 16.7 44.7 46.1 47.5 ...
# extract the names of categorical columns
categorical_cols = c()
for (coln in names(input_data)){
if (is.factor(input_data[[coln]])){
categorical_cols = append(categorical_cols,coln)
}
}
categorical_cols = setdiff(categorical_cols,"CustomerID")
print(categorical_cols)
## [1] "ProfessionalLicensure" "eCommerceAccount"
## [3] "SocialMediaAccount" "NoOfProperties"
## [5] "Asset_type" "Villa_House"
## [7] "Investment_SelfOccupied" "Payment_Status"
#Now, how to find the numerical attributes?
numerical_cols = setdiff(colnames(input_data), categorical_cols)
numerical_cols = setdiff(numerical_cols,"CustomerID")
numerical_cols
## [1] "CreditRiskScore"
## [2] "age_in_years"
## [3] "Total_no_of_loan_tenure_with_current"
## [4] "Remaining_loan_tenure"
## [5] "ratio_RE_inflation"
## [6] "ratio_Loan_to_Appraisedvalu_Percent"
## [7] "ratio_Interestrate"
## [8] "ratio_outstanding"
## [9] "diff_Salary_Utility"
## [10] "ratio_curr_outstanding_Salary"
## [11] "Avg_transaction_reqd"
rownames(input_data) <- input_data$CustomerID
customerID = input_data$CustomerID
input_data$CustomerID = NULL
# OR
# input_data <- input_data[, -c(colnames(input_data) %in% ("name"))]
sum(is.na(input_data))
## [1] 0
# check no missing value in categorical column
# sum(is.na(input_data$shelf))
# had there been any missing value we would have imputed it seperately from numerical using modes.
# library(DMwR)
# input_data[,numerical_cols] <- knnImputation(input_data[,numerical_cols], k = 3, scale = T)
#
# sum(is.na(input_data))
# Make a copy of the dataframe for later use (mixed attributes)
input_data_ori = input_data
# library("dummies")
# shelfDummies = data.frame(dummy(input_data$shelf))
#
# # name the new attributes appropriately
# names(shelfDummies) = c("Shelf1","Shelf2","Shelf3")
# head(shelfDummies)
#
# library("dummies")
#
# dummifiedVarDF = list()
# for (i in 1:length(categorical_cols)){
# dummifiedVarDF[[i]] = data.frame(dummy(input_data[[categorical_cols[i]]]))
# names(dummifiedVarDF[[i]]) = levels(input_data[[categorical_cols[i]]])
# input_data[[categorical_cols[i]]] = NULL
# input_data = data.frame(cbind(input_data,dummifiedVarDF[[i]]))
# }
#
# # print(dummifiedVarDF[1])
# head(input_data)
library("dummies")
## dummies-1.5.6 provided by Decision Patterns
ProfessionalLicensure_Dummies = data.frame(dummy(input_data$ProfessionalLicensure))
# name the new attributes appropriately
names(ProfessionalLicensure_Dummies) = c("ProfessionalLicensure0","ProfessionalLicensure1")
# head(ProfessionalLicensure_Dummies)
_ eCommerceAccount _
eCommerceAccount_Dummies = data.frame(dummy(input_data$eCommerceAccount))
# name the new attributes appropriately
names(eCommerceAccount_Dummies) = c("eCommerceAccount0","eCommerceAccount1")
# head(eCommerceAccount_Dummies)
_ SocialMediaAccount _
SocialMediaAccount_Dummies = data.frame(dummy(input_data$SocialMediaAccount))
# naming the new attributes appropriately
names(SocialMediaAccount_Dummies) = c("SocialMediaAccount_No","SocialMediaAccount_Yes")
# head(SocialMediaAccount_Dummies)
_ NoOfProperties _
NoOfProperties_Dummies = data.frame(dummy(input_data$NoOfProperties))
# naming the new attributes appropriately
names(NoOfProperties_Dummies) = c("NoOfProperties_1","NoOfProperties_2", "NoOfProperties_3","NoOfProperties_4")
# head(NoOfProperties_Dummies)
__ Asset_type __
Asset_type_Dummies = data.frame(dummy(input_data$Asset_type))
# naming the new attributes appropriately
names(Asset_type_Dummies) = c("Asset_type_Complex_With_shared_services", "Asset_type_No_shred_services")
# head(Asset_type_Dummies)
__ Villa_House __
Villa_House_Dummies = data.frame(dummy(input_data$Villa_House))
# naming the new attributes appropriately
names(Villa_House_Dummies) = c("Villa_House_No","Villa_House_Yes")
# head(Villa_House_Dummies)
__ Investment_SelfOccupied __
Investment_SelfOccupied_Dummies = data.frame(dummy(input_data$Investment_SelfOccupied))
# naming the new attributes appropriately
names(Investment_SelfOccupied_Dummies) = c("Investment_SelfOccupied_Investment", "Investment_SelfOccupied_Self_Occupancy")
# head(Investment_SelfOccupied_Dummies)
__ Payment_Status __
Payment_Status_Dummies = data.frame(dummy(input_data$Payment_Status))
# naming the new attributes appropriately
names(Payment_Status_Dummies) = c("Payment_Status_Default", "Payment_Status_Non-Payoff/Non-Default", "Payment_Status_Payoff")
# head(Payment_Status_Dummies)
input_data$ProfessionalLicensure = NULL
input_data$eCommerceAccount = NULL
input_data$SocialMediaAccount = NULL
input_data$NoOfProperties = NULL
input_data$Asset_type = NULL
input_data$Villa_House = NULL
input_data$Investment_SelfOccupied = NULL
input_data$Payment_Status = NULL
input_data = data.frame(cbind(input_data, ProfessionalLicensure_Dummies, eCommerceAccount_Dummies, SocialMediaAccount_Dummies, NoOfProperties_Dummies, Asset_type_Dummies, Villa_House_Dummies, Investment_SelfOccupied_Dummies, Payment_Status_Dummies))
input_data[, numerical_cols] = scale(input_data[,numerical_cols], center = T, scale = T)
In the partitioning approach, you specify K: the number of clusters sought. Observations are then randomly divided into K groups and reshufled to form cohesive clusters.
library(animation)
## Warning: package 'animation' was built under R version 3.5.2
set.seed(123)
km_ani <- kmeans.ani(subset(input_data, select=c("diff_Salary_Utility", "CreditRiskScore")), centers = 3)
As we can see, it needed few iterations to find final clusters
set.seed(123)
km_basic <- kmeans(input_data, centers = 3, nstart = 4)
# plot in 2-d
str(km_basic)
## List of 9
## $ cluster : Named int [1:11071] 2 3 2 2 2 3 3 3 1 3 ...
## ..- attr(*, "names")= chr [1:11071] "C12116" "C12117" "C12118" "C12119" ...
## $ centers : num [1:3, 1:30] -0.906 1.131 0.719 -0.325 2.034 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:30] "CreditRiskScore" "age_in_years" "Total_no_of_loan_tenure_with_current" "Remaining_loan_tenure" ...
## $ totss : num 159124
## $ withinss : num [1:3] 47566 26307 45177
## $ tot.withinss: num 119051
## $ betweenss : num 40073
## $ size : int [1:3] 5167 1063 4841
## $ iter : int 3
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.5.2
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# plot in 2-d
fviz_cluster(km_basic, input_data, labelsize = 7)
set.seed(123)
fviz_nbclust(input_data, kmeans, method = "wss")
set.seed(123)
km_clust <- kmeans(input_data, centers = 3, nstart = 4)
# Store the cluster assignments in a new data frame
input_clusts_km <- as.data.frame(cbind(clust = km_clust$cluster, input_data))
# Look at the head of the data
head(input_clusts_km)
## clust CreditRiskScore age_in_years
## C12116 2 0.5662109 2.6969057
## C12117 3 0.9444832 -0.2559390
## C12118 2 1.3497751 2.4508353
## C12119 2 1.1336194 0.9744130
## C12120 2 1.3497751 1.2204833
## C12121 3 0.7485922 0.2362018
## Total_no_of_loan_tenure_with_current Remaining_loan_tenure
## C12116 -0.03227037 -1.0423143
## C12117 -1.36712851 0.6626760
## C12118 1.01654674 -1.9166683
## C12119 1.01654674 -1.8729506
## C12120 1.01654674 -1.8292329
## C12121 0.25377066 -0.3865488
## ratio_RE_inflation ratio_Loan_to_Appraisedvalu_Percent
## C12116 3.8164550 -2.6254003
## C12117 0.4794188 -0.6191181
## C12118 4.6861716 -2.5561725
## C12119 4.6245219 -2.5583645
## C12120 4.6505310 -2.6586567
## C12121 -0.8634853 0.8826714
## ratio_Interestrate ratio_outstanding diff_Salary_Utility
## C12116 -0.23521295 -1.9582842 -0.4181081
## C12117 0.09080138 0.3542952 -0.3606889
## C12118 -0.23429193 -0.6412755 -0.3631720
## C12119 -0.23521295 -0.7240622 -0.4052008
## C12120 -0.32280141 -1.3030235 -0.4289440
## C12121 -0.13753936 -0.1880079 -0.3571597
## ratio_curr_outstanding_Salary Avg_transaction_reqd
## C12116 -0.7814550 0.08201288
## C12117 0.1005737 -0.21563142
## C12118 -0.7848104 0.41180602
## C12119 -0.7986421 0.44235934
## C12120 -0.5845045 0.47539331
## C12121 -0.1667940 0.14422126
## ProfessionalLicensure0 ProfessionalLicensure1 eCommerceAccount0
## C12116 1 0 0
## C12117 0 1 0
## C12118 1 0 0
## C12119 1 0 0
## C12120 0 1 0
## C12121 1 0 0
## eCommerceAccount1 SocialMediaAccount_No SocialMediaAccount_Yes
## C12116 1 1 0
## C12117 1 0 1
## C12118 1 0 1
## C12119 1 1 0
## C12120 1 1 0
## C12121 1 1 0
## NoOfProperties_1 NoOfProperties_2 NoOfProperties_3 NoOfProperties_4
## C12116 0 1 0 0
## C12117 1 0 0 0
## C12118 1 0 0 0
## C12119 1 0 0 0
## C12120 1 0 0 0
## C12121 1 0 0 0
## Asset_type_Complex_With_shared_services
## C12116 0
## C12117 0
## C12118 0
## C12119 1
## C12120 0
## C12121 0
## Asset_type_No_shred_services Villa_House_No Villa_House_Yes
## C12116 1 0 1
## C12117 1 0 1
## C12118 1 0 1
## C12119 0 1 0
## C12120 1 0 1
## C12121 1 1 0
## Investment_SelfOccupied_Investment
## C12116 0
## C12117 0
## C12118 0
## C12119 0
## C12120 0
## C12121 0
## Investment_SelfOccupied_Self_Occupancy Payment_Status_Default
## C12116 1 1
## C12117 1 1
## C12118 1 0
## C12119 1 0
## C12120 1 0
## C12121 1 0
## Payment_Status_Non.Payoff.Non.Default Payment_Status_Payoff
## C12116 0 0
## C12117 0 0
## C12118 1 0
## C12119 1 0
## C12120 1 0
## C12121 1 0
fviz_cluster(km_clust, input_data, labelsize = 7)
library(cluster)
## Warning: package 'cluster' was built under R version 3.5.2
pamx <- pam(input_data, 3, )
# pamx
fviz_cluster(pamx, labelsize = 7)